In [1]:
import lasio
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor # for KNN regression
import matplotlib.pyplot as plt # for data visualization
import plotly.express as px # for data visualization
In [2]:
# Drop the logs LTEN, MINMK, AVTX, BVTX, and ITT
import lasio
# Load the LAS file
las = lasio.read("1054310680.LAS")
# Create a dataframe from the well log data
well = las.df()
# Drop the logs LTEN, MINMK, AVTX, BVTX, and ITT
logs_to_drop = ["LTEN", "MINMK", "AVTX", "BVTX", "ITT", "SPOR", "CNPOR", "DPOR",'DCAL','SP','PE','MELCAL','RHOB','RHOC']
df = well.drop(columns=logs_to_drop, errors='ignore')
# Drop rows with missing values
df = df.dropna(how="any")
# Print the shape and check for any missing values
print(df.shape)
print(df.isnull().sum())
(3409, 10) TBHV 0 ABHV 0 RXORT 0 RILD 0 RILM 0 RLL3 0 GR 0 DT 0 MEL15 0 MEL20 0 dtype: int64
In [3]:
df
Out[3]:
| TBHV | ABHV | RXORT | RILD | RILM | RLL3 | GR | DT | MEL15 | MEL20 | |
|---|---|---|---|---|---|---|---|---|---|---|
| DEPT | ||||||||||
| 1400.0 | 591.1821 | 309.9621 | -33.2771 | 5.6551 | 6.2686 | 13.2490 | 57.4060 | 72.2373 | 32.4823 | 22.6156 |
| 1400.5 | 590.9929 | 309.8554 | -48.0554 | 5.9328 | 7.3321 | 20.2866 | 52.1534 | 68.3856 | 36.9438 | 28.6958 |
| 1401.0 | 590.8032 | 309.7481 | -45.3393 | 6.2462 | 7.9173 | 19.9243 | 52.5652 | 67.2499 | 40.4439 | 32.9060 |
| 1401.5 | 590.6128 | 309.6403 | -28.3571 | 6.5754 | 7.8046 | 13.5833 | 58.3222 | 68.8124 | 34.3923 | 25.4983 |
| 1402.0 | 590.4225 | 309.5325 | -17.2047 | 6.8085 | 7.3175 | 10.5734 | 67.5882 | 72.1245 | 24.9305 | 14.5826 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3102.0 | 0.7963 | 0.3838 | -4.6366 | 11.6097 | 10.7584 | 13.0719 | 34.0328 | 63.0116 | 19.2898 | 14.8824 |
| 3102.5 | 0.6370 | 0.3070 | -6.4359 | 10.5467 | 9.8785 | 12.4345 | 36.2984 | 63.6701 | 17.8834 | 13.6291 |
| 3103.0 | 0.4773 | 0.2298 | -10.7346 | 9.5389 | 9.1828 | 12.5537 | 36.2811 | 64.0720 | 17.7503 | 13.5456 |
| 3103.5 | 0.3178 | 0.1528 | -14.2869 | 8.6482 | 8.4517 | 12.4643 | 35.2736 | 64.2317 | 17.7073 | 13.5098 |
| 3104.0 | 0.1587 | 0.0762 | -14.0149 | 7.9253 | 7.6452 | 11.3432 | 34.4259 | 64.2915 | 17.6699 | 13.4919 |
3409 rows × 10 columns
In [4]:
import pandas as pd
import matplotlib.pyplot as plt
# Calculate the Pearson correlation coefficients
correlations = df.corr()['DT']
# Remove the correlation between 'GR' and itself
correlations = correlations.drop('DT')
# Plot the correlations
plt.figure(figsize=(10, 6))
correlations.plot(kind='bar')
plt.xlabel('Logs')
plt.ylabel('Pearson Correlation Coefficient')
plt.title('Pearson Correlation Coefficients between DT and Other Logs')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [5]:
# Define brief names and units for each log
# Define brief names and units for each log
log_brief_units = {
'TBHV': 'FT3',
'ABHV': 'FT3',
'GR': 'GAPI',
'RILD': 'Ω-m',
'RILM': 'Ω-m',
'RLL3': 'Ω-m',
'CILD': 'MMHO/M',
'CILM': 'MMHO/M ',
'CLL3': 'MMHO/M',
'RXORT': 'Ω-m',
'SP': 'mV',
'DCAL': 'inches',
'DPOR': 'PU',
'PE': 'unitless',
'RHOB': 'g/cc',
'RHOC': 'g/cc',
'MEL15': 'Ω-m',
'MEL20': 'Ω-m',
'DT': 'μs/ft',
'MELCAL': 'inches'
}
def plot_all_logs_with_brief_units_descending(data):
"""Plots multiple well logs with brief units, ensuring depth is in descending order.
Args:
data: A pandas DataFrame containing well log data. The index should represent depth.
Returns:
None
"""
# Ensure the depth data is in descending order
data_sorted = data.sort_index(ascending=False)
num_logs = len(data_sorted.columns)
fig, axes = plt.subplots(nrows=1, ncols=num_logs, figsize=(num_logs * 3, 25), sharey=True, dpi=500)
for ax, log in zip(axes, data_sorted.columns):
# Apply log scale for resistivity and micrologs if necessary
if log in ['RILD', 'RILM', 'RLL3', 'MEL15', 'MEL20']:
ax.plot(np.log10(data_sorted[log]), data_sorted.index, label=log)
else:
ax.plot(data_sorted[log], data_sorted.index, label=log)
# Set the title to the log name with brief unit
ax.set_title(f"{log} ({log_brief_units[log]})", fontsize=30)
ax.grid()
# Increase the font size of the x-axis ticks (log values)
ax.tick_params(axis='x', labelsize=25)
# Set the y-axis label and invert it for descending depth
axes[0].set_ylabel("Depth (ft)", fontsize=35)
axes[0].invert_yaxis()
# Increase the font size of the y-axis ticks (depth numbers)
axes[0].tick_params(axis='y', labelsize=25)
plt.tight_layout()
plt.show()
# Example usage
# Assuming you have a pandas DataFrame named 'df' containing your well log data
plot_all_logs_with_brief_units_descending(df)
In [6]:
df.describe()
Out[6]:
| TBHV | ABHV | RXORT | RILD | RILM | RLL3 | GR | DT | MEL15 | MEL20 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 | 3409.000000 |
| mean | 291.530846 | 150.879273 | -36.330953 | 6.675208 | 7.136873 | 19.769958 | 74.716949 | 79.528714 | 22.982292 | 18.621075 |
| std | 168.437966 | 87.268945 | 26.366208 | 7.105292 | 10.934110 | 30.025319 | 36.455734 | 13.285295 | 15.214074 | 17.093753 |
| min | 0.158700 | 0.076200 | -118.551300 | 0.578800 | 0.613900 | 2.420100 | 17.597800 | 51.511000 | 2.270400 | 1.507400 |
| 25% | 146.818300 | 76.450900 | -51.670300 | 3.603400 | 3.495600 | 6.896500 | 45.722800 | 67.564500 | 14.342700 | 7.550700 |
| 50% | 292.423600 | 151.771200 | -23.666600 | 4.483000 | 4.455100 | 9.246800 | 77.643700 | 83.744300 | 18.099700 | 13.277400 |
| 75% | 432.684900 | 221.749900 | -17.792100 | 6.895400 | 6.942300 | 21.662700 | 97.618200 | 88.251400 | 25.508600 | 22.875300 |
| max | 591.182100 | 309.962100 | 11.621100 | 85.174000 | 181.763200 | 674.902200 | 520.418600 | 111.715400 | 85.618800 | 132.627800 |
In [7]:
# Drop the logs LTEN, MINMK, AVTX, BVTX, and ITT
logs_to_drop = ["LTEN", "MINMK", "AVTX", "BVTX", "ITT", "SPOR", "CNPOR", "DPOR",'DCAL','SP','PE','MELCAL','RHOB','RHOC']
df = well.drop(columns=logs_to_drop, errors='ignore')
# Drop rows with missing values
df = df.dropna(how="any")
# Print the shape and check for any missing values
print(df.shape)
print(df.isnull().sum())
(3409, 10) TBHV 0 ABHV 0 RXORT 0 RILD 0 RILM 0 RLL3 0 GR 0 DT 0 MEL15 0 MEL20 0 dtype: int64
In [8]:
from sklearn.model_selection import train_test_split
train , test = train_test_split(df, test_size = 0.3)
# Use all available features for training
x_train = train.drop('DT', axis=1) # Features excluding the target
y_train = train['DT'] # Target variable
x_test = test.drop('DT', axis=1) # Features for testing
y_test = test['DT'] # Target variable
In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled)
x_test_scaled = scaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test_scaled)
In [10]:
#import required packages
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
In [11]:
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_val = [] # To store RMSE values for different k
for K in range(1, 20): # k values from 1 to 20
model = neighbors.KNeighborsRegressor(n_neighbors=K)
model.fit(x_train, y_train) # Fit the model
pred = model.predict(x_test) # Make predictions on test set
error = sqrt(mean_squared_error(y_test, pred)) # Calculate RMSE
rmse_val.append(error) # Store RMSE values
print('RMSE value for k=', K, 'is:', error)
RMSE value for k= 1 is: 3.4194896717493375 RMSE value for k= 2 is: 3.1716217478044477 RMSE value for k= 3 is: 3.1310420022274017 RMSE value for k= 4 is: 3.143750576329555 RMSE value for k= 5 is: 3.1786921564660036 RMSE value for k= 6 is: 3.2373700583496583 RMSE value for k= 7 is: 3.2975132016834134 RMSE value for k= 8 is: 3.3578073594289233 RMSE value for k= 9 is: 3.3865898653990234 RMSE value for k= 10 is: 3.4634356573953977 RMSE value for k= 11 is: 3.5628731665787274 RMSE value for k= 12 is: 3.585420293996214 RMSE value for k= 13 is: 3.638362584975556 RMSE value for k= 14 is: 3.6918202394921416 RMSE value for k= 15 is: 3.731988452535047 RMSE value for k= 16 is: 3.7884207968686083 RMSE value for k= 17 is: 3.8336088321771844 RMSE value for k= 18 is: 3.873115125781757 RMSE value for k= 19 is: 3.921092997209119
In [12]:
# import matplotlib.pyplot as plt
# Create a DataFrame for the RMSE values
curve = pd.DataFrame(rmse_val, columns=['RMSE'])
curve.index += 1 # Start index at 1 for k values
# Plotting the RMSE values
plt.figure(figsize=(10, 6))
plt.plot(curve.index, curve['RMSE'], marker='o')
plt.title('RMSE vs. Number of Neighbors (k)')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('RMSE')
plt.xticks(curve.index) # Show all k values on x-axis
plt.grid()
plt.show()
In [42]:
modelR = KNeighborsRegressor(n_neighbors=3, #default=3
weights='uniform', #{‘uniform’, ‘distance’} or callable, default='uniform'
algorithm='auto', #{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, default=’auto’
#leaf_size=30, #default=30, Leaf size passed to BallTree or KDTree.
#p=2, #default=2, Power parameter for the Minkowski metric.
#metric='minkowski', #default=’minkowski’, with p=2 is equivalent to the standard Euclidean metric.
metric_params=None, #dict, default=None, Additional keyword arguments for the metric function.
n_jobs=-1 #default=None, The number of parallel jobs to run for neighbors search, -1 means using all processors
)
In [44]:
reg = modelR.fit(x_train, y_train)
In [45]:
# Predict on training data
pred_values_tr = modelR.predict(x_train)
# Predict on a test data
pred_values_te = modelR.predict(x_test)
In [48]:
# Basic info about the model
print("")
print('****************** KNN Regression ******************')
print("")
scoreR_te = modelR.score(x_test, y_test)
print('Test Accuracy Score: ', scoreR_te)
scoreR_tr = modelR.score(x_train, y_train)
print('Training Accuracy Score: ', scoreR_tr)
print('---------------------------------------------------------')
****************** KNN Regression ****************** Test Accuracy Score: 0.9421505741956218 Training Accuracy Score: 0.9763987330543987 ---------------------------------------------------------
In [50]:
# Create a copy of each dataframe before modifying
df_train_new=train.copy()
df_test_new=test.copy()
# ------- Training DataFrame -------
# Attach predicted class labels and values
df_train_new['Predicted DT']=pred_values_tr
# ------- Test DataFrame -------
# Attach predicted class labels and values
df_test_new['Predicted DT']=pred_values_te
# ------- Combined DataFrame -------
# Combine training and testing dataframes back into one
df_new=pd.concat([df_train_new, df_test_new], ignore_index=False, axis=0, sort=False)
data= df_new.sort_index(ascending=True)
data
Out[50]:
| TBHV | ABHV | RXORT | RILD | RILM | RLL3 | GR | DT | MEL15 | MEL20 | Predicted DT | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| DEPT | |||||||||||
| 1400.0 | 591.1821 | 309.9621 | -33.2771 | 5.6551 | 6.2686 | 13.2490 | 57.4060 | 72.2373 | 32.4823 | 22.6156 | 68.259967 |
| 1400.5 | 590.9929 | 309.8554 | -48.0554 | 5.9328 | 7.3321 | 20.2866 | 52.1534 | 68.3856 | 36.9438 | 28.6958 | 66.933933 |
| 1401.0 | 590.8032 | 309.7481 | -45.3393 | 6.2462 | 7.9173 | 19.9243 | 52.5652 | 67.2499 | 40.4439 | 32.9060 | 66.933933 |
| 1401.5 | 590.6128 | 309.6403 | -28.3571 | 6.5754 | 7.8046 | 13.5833 | 58.3222 | 68.8124 | 34.3923 | 25.4983 | 69.334233 |
| 1402.0 | 590.4225 | 309.5325 | -17.2047 | 6.8085 | 7.3175 | 10.5734 | 67.5882 | 72.1245 | 24.9305 | 14.5826 | 73.154167 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3102.0 | 0.7963 | 0.3838 | -4.6366 | 11.6097 | 10.7584 | 13.0719 | 34.0328 | 63.0116 | 19.2898 | 14.8824 | 63.584567 |
| 3102.5 | 0.6370 | 0.3070 | -6.4359 | 10.5467 | 9.8785 | 12.4345 | 36.2984 | 63.6701 | 17.8834 | 13.6291 | 63.584567 |
| 3103.0 | 0.4773 | 0.2298 | -10.7346 | 9.5389 | 9.1828 | 12.5537 | 36.2811 | 64.0720 | 17.7503 | 13.5456 | 63.584567 |
| 3103.5 | 0.3178 | 0.1528 | -14.2869 | 8.6482 | 8.4517 | 12.4643 | 35.2736 | 64.2317 | 17.7073 | 13.5098 | 63.584567 |
| 3104.0 | 0.1587 | 0.0762 | -14.0149 | 7.9253 | 7.6452 | 11.3432 | 34.4259 | 64.2915 | 17.6699 | 13.4919 | 63.584567 |
3409 rows × 11 columns
In [52]:
def plotter():
# Create a single plot
fig, ax = plt.subplots(figsize=(6,20))
# Define the logs and colors
logs = ['DT', 'Predicted DT']
colors = ['red', 'Green']
# Plot both curves on the same axis
for log, color in zip(logs, colors):
ax.plot(data[log], data.index, color=color, label=log)
# Set labels and invert y-axis
ax.invert_yaxis()
ax.set_xlabel("μs/ft")
ax.set_ylabel("Depth (ft)")
# Add a grid and legend
ax.grid()
ax.legend()
# Call the function to plot
plotter()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: